Os Pacotes

### Reading Data
# library() stops with an error when a package is not installed, whereas
# require() only returns FALSE and lets the script fail later at first use.
library('readr')

### Data Wrangling
library('dplyr')
library('tidyr')
library('tibble')
library('DMwR') ## KNN Imputation

### Visualization
library('ggplot2')
library('Amelia')
library('plotly')

Os Dados

# Read the training file and split it into row ids, the response column and
# the remaining predictor columns.
train <- readr::read_csv("./data/train.csv")
IdTrain <- train$Id
YTrain <- train$Target
XTrain <- dplyr::select(train, -Id, -Target)

# Read the test file and split it into row ids and predictor columns.
test <- readr::read_csv("./data/test.csv")
IdTest <- test$Id
XTest <- dplyr::select(test, -Id)

Summaries

# Compact per-column overview (type and first values) of the predictors.
dplyr::glimpse(XTrain)
## Observations: 1,318
## Variables: 83
## $ V1  <int> 2, 3, 4, 3, 3, 3, 3, 4, 3, 5, 4, 2, 3, 2, 2, 3, 3, 3, 3, 3...
## $ V2  <int> 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 5, 5...
## $ V3  <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5...
## $ V4  <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA,...
## $ V5  <int> 303, 180, 621, 518, 840, 532, 260, 554, 328, 444, 506, 520...
## $ V6  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V7  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ V8  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 5, 1, 1, 1, 1, 1...
## $ V9  <int> 5, 7, 13, 25, 6, 6, 13, 9, 6, 5, 17, 12, 21, 18, 1, 8, 9, ...
## $ V10 <int> 5, 7, 8, 7, 7, 7, 5, 7, 5, 6, 5, 5, 8, 5, 8, 6, 6, 7, 7, 5...
## $ V11 <int> 0, 0, 207, 47, 33, 74, 0, 54, 0, 168, 211, 0, 50, 0, 20, 3...
## $ V12 <int> 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
## $ V13 <int> 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 6, 2, 2, 6, 2, 2, 2, 2, 2, 6...
## $ V14 <int> 10708, 11500, 15865, 17542, 10652, 9135, 7390, 8795, 8385,...
## $ V15 <int> 0, 1037, 0, 651, 0, 0, 0, 1276, 0, 1111, 0, 0, 0, 0, 0, 0,...
## $ V16 <int> 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0...
## $ V17 <int> 476, 0, 81, 220, 160, 192, 0, 224, 210, 133, 0, 0, 322, 0,...
## $ V18 <int> 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1...
## $ V19 <int> 14, 4, 14, 14, 13, 13, 14, 13, 7, 7, 10, 7, 13, 15, 13, 7,...
## $ V20 <int> 1867, 2057, 2217, 2167, 1494, 1536, 1098, 2256, 985, 2087,...
## $ V21 <int> 5, 7, 6, 7, 5, 5, 7, 5, 8, 8, 5, 5, 5, 6, 5, 7, 5, 6, 6, 5...
## $ V22 <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3...
## $ V23 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ V24 <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...
## $ V25 <int> 142, 322, 224, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 144, 0, 0,...
## $ V26 <int> 3, 3, 5, 3, 3, NA, NA, 5, NA, NA, NA, NA, 3, 3, 5, 2, 5, 5...
## $ V27 <int> 3, 3, 3, 3, 4, 2, 2, 3, 2, 2, 2, 3, 4, 3, 2, 2, 3, 2, 3, 3...
## $ V28 <int> 2, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ V29 <int> 2, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 2...
## $ V30 <int> 2, 6, 5, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6...
## $ V31 <int> 3, 1, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1...
## $ V32 <int> 1955, 1936, 1970, 1974, 2006, 2002, 1955, 2000, 1977, 1966...
## $ V33 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4...
## $ V34 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ V35 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V36 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V37 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ V38 <int> 1, 1, 3, 2, 2, 2, 3, 1, 3, 1, 3, 2, 2, 3, 1, 2, 1, 3, 2, 3...
## $ V39 <int> 1617, 1017, 2217, 1192, 1494, 1536, 1098, 952, 985, 832, 1...
## $ V40 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ V41 <int> 4, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 1, 4, 3, 4, 3, 4, 3, 4...
## $ V42 <int> NA, 86, 95, NA, 91, 70, NA, NA, 65, NA, 87, 100, 85, 56, 4...
## $ V43 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4...
## $ V44 <int> 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ V45 <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 1, 5, 5, 2, 3...
## $ V46 <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 9, 1, 3, 3, 3, 3, 5, 2...
## $ V47 <int> 379, 223, 351, 125, 0, 810, 902, 300, 595, 568, 0, 0, 1220...
## $ V48 <int> 20, 50, 20, 60, 20, 20, 20, 60, 80, 60, 90, 20, 20, 30, 12...
## $ V49 <int> 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 4, 4, 3, 4, 3, 4, 4, 4, 3, 4...
## $ V50 <dbl> 1.436426e+05, 4.972889e+04, 1.232003e+05, 1.562537e+04, -6...
## $ V51 <int> 3, 1, 3, 6, 3, 3, 3, 6, 8, 6, 3, 3, 3, 3, 3, 3, 6, 3, 6, 3...
## $ V52 <int> 4, 2, 1, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 4, 2, 4, 2, 2...
## $ V53 <int> 4, 4, 2, 2, 1, 1, 4, 4, 1, 2, 4, 4, 3, 4, 4, 4, 2, 4, 1, 4...
## $ V54 <int> 470, 794, 1043, 36, 1494, 726, 196, 652, 390, 264, 1656, 9...
## $ V55 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ V56 <int> 15, 4, 15, 15, 14, 14, 15, 14, 7, 7, 11, 7, 14, 16, 14, 7,...
## $ V57 <int> 1, 3, 1, 5, 1, 1, 5, 1, 5, 3, 5, 1, 1, 3, 1, 5, 3, 2, 1, 5...
## $ V58 <int> 1, 1, 2, 2, 3, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1...
## $ V59 <int> 1993, 1987, 1970, 2003, 2007, 2003, 1955, 2000, 1977, 2007...
## $ V60 <int> 1, 1, 1, 1, 1, 4, 1, 1, 4, 1, 1, 4, 1, 4, 4, 1, 1, 4, 2, 4...
## $ V61 <int> 49, 36, 64, 81, 49, 49, 36, 64, 36, 81, 64, 16, 64, 25, 36...
## $ V62 <int> 1867, 1020, 2217, 1516, 1494, 1536, 1098, 980, 985, 976, 1...
## $ V63 <int> 7, 6, 8, 9, 7, 7, 6, 8, 6, 9, 8, 4, 8, 5, 6, 6, 7, 6, 6, 5...
## $ V64 <int> 4, 5, 1, 4, 6, 3, 1, 3, 3, 4, 6, 6, 3, 6, 3, 2, 3, 1, 3, 1...
## $ V65 <int> 3, 2, 3, 3, 1, 2, 2, 3, 1, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 2...
## $ V66 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V67 <int> 4, 3, 3, 3, 3, 3, 4, 3, 4, 3, 4, 4, 3, 4, 3, 4, 4, 4, 3, 4...
## $ V68 <int> 768, 0, 823, 1031, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V69 <int> 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0...
## $ V70 <int> 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5...
## $ V71 <dbl> 114661264, 132250000, 251698225, 307721764, 113465104, 834...
## $ V72 <int> 0, 0, 0, 0, 306, 113, 151, 0, 220, 157, 564, 0, 420, 0, 16...
## $ V73 <int> 1, 9, 9, 9, 7, 9, 9, 9, 9, 9, 9, 9, 7, 9, 9, 9, 9, 9, 5, 9...
## $ V74 <int> 11, 6, 10, 7, 8, 12, 7, 4, 11, 7, 11, 2, 5, 3, 4, 5, 7, 9,...
## $ V75 <int> 2009, 2006, 2007, 2007, 2007, 2008, 2008, 2009, 2008, 2007...
## $ V76 <int> 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5...
## $ V77 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ V78 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130, 0, 0, 0, 0, 0,...
## $ V79 <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ V80 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V81 <int> 1955, 1936, 1970, 1974, 2006, 2002, 1955, 2000, 1977, 1966...
## $ V82 <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1...
## $ V83 <int> 4, 4, 2, 4, 4, 4, 4, 4, 2, 1, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4...
# Per-column summary statistics and NA counts of the predictors.
summary(XTrain)
##        V1              V2              V3              V4       
##  Min.   :0.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:5.000   1st Qu.:5.000   1st Qu.:1.000  
##  Median :3.000   Median :5.000   Median :5.000   Median :1.000  
##  Mean   :2.866   Mean   :4.689   Mean   :4.904   Mean   :1.476  
##  3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:2.000  
##  Max.   :8.000   Max.   :5.000   Max.   :5.000   Max.   :2.000  
##                  NA's   :1       NA's   :72      NA's   :1236   
##        V5               V6                V7              V8       
##  Min.   :   0.0   Min.   :    0.0   Min.   :0.000   Min.   :1.000  
##  1st Qu.: 326.2   1st Qu.:    0.0   1st Qu.:1.000   1st Qu.:1.000  
##  Median : 476.5   Median :    0.0   Median :1.000   Median :1.000  
##  Mean   : 470.9   Mean   :   41.5   Mean   :1.045   Mean   :1.478  
##  3rd Qu.: 576.0   3rd Qu.:    0.0   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :1418.0   Max.   :15500.0   Max.   :3.000   Max.   :5.000  
##                                                                    
##        V9             V10              V11             V12       
##  Min.   : 1.00   Min.   : 1.000   Min.   :  0.0   Min.   :1.000  
##  1st Qu.: 8.00   1st Qu.: 5.000   1st Qu.:  0.0   1st Qu.:5.000  
##  Median :13.00   Median : 6.000   Median : 24.0   Median :5.000  
##  Mean   :13.16   Mean   : 6.111   Mean   : 45.9   Mean   :4.874  
##  3rd Qu.:18.00   3rd Qu.: 7.000   3rd Qu.: 66.0   3rd Qu.:5.000  
##  Max.   :25.00   Max.   :10.000   Max.   :547.0   Max.   :5.000  
##                                                   NA's   :72     
##       V13             V14              V15              V16        
##  Min.   :1.000   Min.   :  1300   Min.   :   0.0   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:  7558   1st Qu.:   0.0   1st Qu.:0.0000  
##  Median :2.000   Median :  9502   Median :   0.0   Median :0.0000  
##  Mean   :3.268   Mean   : 10549   Mean   : 350.6   Mean   :0.3832  
##  3rd Qu.:6.000   3rd Qu.: 11645   3rd Qu.: 731.5   3rd Qu.:1.0000  
##  Max.   :6.000   Max.   :215245   Max.   :2065.0   Max.   :2.0000  
##  NA's   :72                                                        
##       V17             V18             V19             V20      
##  Min.   :  0.0   Min.   :0.000   Min.   : 1.00   Min.   : 334  
##  1st Qu.:  0.0   1st Qu.:1.000   1st Qu.: 9.00   1st Qu.:1132  
##  Median :  0.0   Median :2.000   Median :13.00   Median :1470  
##  Mean   : 93.7   Mean   :1.565   Mean   :10.65   Mean   :1520  
##  3rd Qu.:168.0   3rd Qu.:2.000   3rd Qu.:13.00   3rd Qu.:1778  
##  Max.   :857.0   Max.   :3.000   Max.   :15.00   Max.   :5642  
##                                                                
##       V21             V22             V23             V24       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:5.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:7.000  
##  Median :5.000   Median :3.000   Median :2.000   Median :7.000  
##  Mean   :5.596   Mean   :2.857   Mean   :1.939   Mean   :6.751  
##  3rd Qu.:6.000   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:7.000  
##  Max.   :9.000   Max.   :3.000   Max.   :2.000   Max.   :7.000  
##                                                                 
##       V25              V26             V27            V28       
##  Min.   :  0.00   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:  0.00   1st Qu.:3.000   1st Qu.:2.00   1st Qu.:2.000  
##  Median :  0.00   Median :3.000   Median :3.00   Median :3.000  
##  Mean   : 15.12   Mean   :3.737   Mean   :2.76   Mean   :2.414  
##  3rd Qu.:  0.00   3rd Qu.:5.000   3rd Qu.:3.00   3rd Qu.:3.000  
##  Max.   :480.00   Max.   :5.000   Max.   :4.00   Max.   :4.000  
##                   NA's   :622     NA's   :8      NA's   :1069   
##       V29             V30             V31              V32      
##  Min.   :1.000   Min.   :1.000   Min.   :0.0000   Min.   :1900  
##  1st Qu.:2.000   1st Qu.:6.000   1st Qu.:0.0000   1st Qu.:1961  
##  Median :2.000   Median :6.000   Median :1.0000   Median :1980  
##  Mean   :2.392   Mean   :5.715   Mean   :0.6168   Mean   :1978  
##  3rd Qu.:3.000   3rd Qu.:6.000   3rd Qu.:1.0000   3rd Qu.:2002  
##  Max.   :6.000   Max.   :6.000   Max.   :3.0000   Max.   :2010  
##                  NA's   :31                       NA's   :72    
##       V33             V34             V35               V36         
##  Min.   :1.000   Min.   :1.000   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:4.000   1st Qu.:2.000   1st Qu.:  0.000   1st Qu.:  0.000  
##  Median :4.000   Median :2.000   Median :  0.000   Median :  0.000  
##  Mean   :3.775   Mean   :1.996   Mean   :  2.496   Mean   :  5.724  
##  3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:  0.000   3rd Qu.:  0.000  
##  Max.   :4.000   Max.   :2.000   Max.   :648.000   Max.   :572.000  
##                                                                     
##       V37         V38            V39            V40             V41       
##  Min.   :1   Min.   :1.00   Min.   :   0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1   1st Qu.:1.00   1st Qu.: 799   1st Qu.:3.000   1st Qu.:3.000  
##  Median :1   Median :2.00   Median : 992   Median :3.000   Median :3.000  
##  Mean   :1   Mean   :2.18   Mean   :1061   Mean   :2.884   Mean   :3.269  
##  3rd Qu.:1   3rd Qu.:3.00   3rd Qu.:1292   3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :1   Max.   :3.00   Max.   :6110   Max.   :4.000   Max.   :4.000  
##              NA's   :72                    NA's   :1275    NA's   :30     
##       V42              V43             V44             V45       
##  Min.   : 21.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 60.00   1st Qu.:4.000   1st Qu.:2.000   1st Qu.:3.000  
##  Median : 70.00   Median :4.000   Median :2.000   Median :5.000  
##  Mean   : 70.34   Mean   :4.029   Mean   :2.077   Mean   :4.017  
##  3rd Qu.: 80.00   3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :313.00   Max.   :5.000   Max.   :8.000   Max.   :5.000  
##  NA's   :232                                                     
##       V46             V47              V48             V49       
##  Min.   :1.000   Min.   :   0.0   Min.   : 20.0   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:   0.0   1st Qu.: 20.0   1st Qu.:3.000  
##  Median :3.000   Median : 384.5   Median : 50.0   Median :4.000  
##  Mean   :3.027   Mean   : 444.7   Mean   : 56.3   Mean   :3.537  
##  3rd Qu.:3.000   3rd Qu.: 706.0   3rd Qu.: 70.0   3rd Qu.:4.000  
##  Max.   :9.000   Max.   :5644.0   Max.   :190.0   Max.   :4.000  
##                                                                  
##       V50                V51             V52             V53       
##  Min.   :      -3   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:       1   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:3.000  
##  Median :  147840   Median :3.000   Median :2.000   Median :4.000  
##  Mean   :  404778   Mean   :4.025   Mean   :2.429   Mean   :3.277  
##  3rd Qu.:  498436   3rd Qu.:6.000   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   :31854738   Max.   :8.000   Max.   :6.000   Max.   :4.000  
##                                                     NA's   :31     
##       V54              V55            V56             V57       
##  Min.   :   0.0   Min.   :1.00   Min.   : 1.00   Min.   :1.000  
##  1st Qu.: 237.2   1st Qu.:1.25   1st Qu.: 9.00   1st Qu.:1.000  
##  Median : 484.0   Median :2.00   Median :14.00   Median :1.000  
##  Mean   : 570.0   Mean   :2.00   Mean   :11.37   Mean   :2.524  
##  3rd Qu.: 808.0   3rd Qu.:2.75   3rd Qu.:14.00   3rd Qu.:5.000  
##  Max.   :2336.0   Max.   :3.00   Max.   :16.00   Max.   :5.000  
##                   NA's   :1312                                  
##       V58             V59            V60             V61        
##  Min.   :0.000   Min.   :1950   Min.   :1.000   Min.   :  4.00  
##  1st Qu.:1.000   1st Qu.:1966   1st Qu.:1.000   1st Qu.: 25.00  
##  Median :2.000   Median :1994   Median :4.000   Median : 36.00  
##  Mean   :1.763   Mean   :1985   Mean   :2.936   Mean   : 45.29  
##  3rd Qu.:2.000   3rd Qu.:2004   3rd Qu.:4.000   3rd Qu.: 49.00  
##  Max.   :4.000   Max.   :2010   Max.   :4.000   Max.   :196.00  
##                                                                 
##       V62              V63              V64             V65       
##  Min.   : 334.0   Min.   : 2.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 882.5   1st Qu.: 5.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :1088.0   Median : 6.000   Median :3.000   Median :2.000  
##  Mean   :1163.5   Mean   : 6.527   Mean   :3.722   Mean   :1.996  
##  3rd Qu.:1389.8   3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:3.000  
##  Max.   :4692.0   Max.   :14.000   Max.   :6.000   Max.   :3.000  
##                                    NA's   :30                     
##       V66              V67             V68               V69        
##  Min.   :0.0000   Min.   :1.000   Min.   :   0.00   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:   0.00   1st Qu.:0.0000  
##  Median :0.0000   Median :4.000   Median :   0.00   Median :0.0000  
##  Mean   :0.0569   Mean   :3.338   Mean   :  46.03   Mean   :0.4256  
##  3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.:   0.00   3rd Qu.:1.0000  
##  Max.   :2.0000   Max.   :4.000   Max.   :1474.00   Max.   :3.0000  
##                                                                     
##       V70             V71                 V72              V73       
##  Min.   :1.000   Min.   :1.690e+06   Min.   :   0.0   Min.   :1.000  
##  1st Qu.:5.000   1st Qu.:5.713e+07   1st Qu.:   0.0   1st Qu.:9.000  
##  Median :5.000   Median :9.028e+07   Median :   0.0   Median :9.000  
##  Mean   :4.727   Mean   :2.117e+08   Mean   : 104.0   Mean   :8.517  
##  3rd Qu.:5.000   3rd Qu.:1.356e+08   3rd Qu.: 165.8   3rd Qu.:9.000  
##  Max.   :5.000   Max.   :4.633e+10   Max.   :1600.0   Max.   :9.000  
##                                      NA's   :8                       
##       V74              V75            V76             V77       
##  Min.   : 1.000   Min.   :2006   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 5.000   1st Qu.:2007   1st Qu.:5.000   1st Qu.:2.000  
##  Median : 6.000   Median :2008   Median :5.000   Median :2.000  
##  Mean   : 6.296   Mean   :2008   Mean   :4.782   Mean   :2.037  
##  3rd Qu.: 8.000   3rd Qu.:2009   3rd Qu.:5.000   3rd Qu.:2.000  
##  Max.   :12.000   Max.   :2010   Max.   :6.000   Max.   :6.000  
##                                                                 
##       V78              V79            V80               V81      
##  Min.   :  0.00   Min.   :1.00   Min.   :  0.000   Min.   :1872  
##  1st Qu.:  0.00   1st Qu.:3.00   1st Qu.:  0.000   1st Qu.:1954  
##  Median :  0.00   Median :3.00   Median :  0.000   Median :1972  
##  Mean   : 22.44   Mean   :3.01   Mean   :  3.628   Mean   :1971  
##  3rd Qu.:  0.00   3rd Qu.:3.00   3rd Qu.:  0.000   3rd Qu.:2000  
##  Max.   :552.00   Max.   :8.00   Max.   :508.000   Max.   :2010  
##                                                                  
##       V82            V83       
##  Min.   :1.00   Min.   :1.000  
##  1st Qu.:1.00   1st Qu.:4.000  
##  Median :1.00   Median :4.000  
##  Mean   :1.06   Mean   :3.805  
##  3rd Qu.:1.00   3rd Qu.:4.000  
##  Max.   :3.00   Max.   :4.000  
##                 NA's   :30

Saída

# Summary statistics of the response.
summary(YTrain)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  130000  163000  181300  214000  755000
# Histogram of the response values.
plotly::plot_ly(x = YTrain) %>%
  plotly::add_histogram()
# Box plot of the response with every observation drawn beside the box.
plotly::plot_ly(y = YTrain, type = "box", boxpoints = "all", jitter = 0.5,
        pointpos = -1.8) 
# Response values in increasing order plotted against their rank.
# The rank axis is built from the sorted vector itself: sort() drops NAs and
# 1:length() yields c(1, 0) on empty input, so indexing by the original length
# could give x and y different lengths.
sortedYTrain <- sort(YTrain)
plotly::plot_ly(x = seq_along(sortedYTrain), y = sortedYTrain) %>%
  plotly::add_lines() %>%
  plotly::add_markers()

Análise

Problemas na Entrada
  • NAs
  • Dados Fora de Escala e de Distribuições Desbalanceadas
  • Grande Número de Variáveis
  • Desconhecimento das Relações e Interações entre variáveis e saída
Problemas na Saída
  • Saída não linear
Possíveis Pré-Processamentos
NAs
  • Binarização de NAs
  • Substituição de NAs pela média
  • Substituição de NAs pela mediana
  • Substituição de NAs por moda
  • Substituição de NAs por valores extremos
Dados Fora de Escala e de Distribuições Desbalanceadas
  • Normalização dos Dados
  • Escalamento dos Dados
  • Transformações Logarítmicas
  • Transformações de dados contínuos para bins
Grande Número de Variáveis
  • PCA para variáveis contínuas
Desconhecimento das Relações e Interações entre variáveis e saída
  • Análise de Correlação
  • Relações não-Lineares entre entrada e saída

Pré-Processamentos

Variáveis com NAs

Quais são as variáveis e quanto de NAs elas possuem
# Number of missing values per predictor in the training set, keeping only
# the variables with at least one NA, most affected first.
# The plain function `sum` is passed to summarise_all(): dplyr::funs() is
# deprecated, and this keeps the pipeline identical to the test-set one below.
trainMissingData <- XTrain %>% 
                      dplyr::mutate_all(is.na) %>%
                      dplyr::summarise_all(sum) %>%
                      tidyr::gather(Var, NaCountTrain) %>%
                      dplyr::arrange(dplyr::desc(NaCountTrain)) %>%
                      dplyr::filter(NaCountTrain > 0)

# Same count for the test set.
testMissingData <- XTest %>% 
                    dplyr::mutate_all(is.na) %>%
                    dplyr::summarise_all(sum) %>%
                    tidyr::gather(Var, NaCountTest) %>%
                    dplyr::arrange(dplyr::desc(NaCountTest)) %>%
                    dplyr::filter(NaCountTest > 0)

# Join both counts by variable and add the share of missing rows (in percent,
# two decimals). The full join keeps variables that have NAs in only one of
# the two sets; their count for the other set is NA.
nTrainningObs <- nrow(XTrain)
nTestObs <- nrow(XTest)
missingResultTable <- trainMissingData %>%
  dplyr::full_join(testMissingData, by = 'Var') %>%
  dplyr::mutate(NaCountTrainPerc = 100 * round(NaCountTrain / nTrainningObs, 4),
                NaCountTestPerc = 100 * round(NaCountTest / nTestObs, 4))
missingResultTable
## # A tibble: 19 x 5
##      Var NaCountTrain NaCountTest NaCountTrainPerc NaCountTestPerc
##    <chr>        <int>       <int>            <dbl>           <dbl>
##  1   V55         1312        1310            99.54           99.47
##  2   V40         1275        1268            96.74           96.28
##  3    V4         1236        1234            93.78           93.70
##  4   V28         1069        1069            81.11           81.17
##  5   V26          622         617            47.19           46.85
##  6   V42          232         235            17.60           17.84
##  7    V3           72          75             5.46            5.69
##  8   V12           72          75             5.46            5.69
##  9   V13           72          75             5.46            5.69
## 10   V32           72          75             5.46            5.69
## 11   V38           72          75             5.46            5.69
## 12   V30           31          38             2.35            2.89
## 13   V53           31          38             2.35            2.89
## 14   V41           30          37             2.28            2.81
## 15   V64           30          37             2.28            2.81
## 16   V83           30          37             2.28            2.81
## 17   V27            8           8             0.61            0.61
## 18   V72            8           8             0.61            0.61
## 19    V2            1           1             0.08            0.08
Visualizando relação entre NAs
# Missingness maps restricted to the variables listed in missingResultTable:
# first for the training set, then for the test set.
missingDataColNames <- missingResultTable$Var
Amelia::missmap(
    data.frame(dplyr::select(XTrain, dplyr::one_of(missingDataColNames))),
    col = c("black", "grey")
)

Amelia::missmap(
    data.frame(dplyr::select(XTest, dplyr::one_of(missingDataColNames))),
    col = c("black", "grey")
)

Visualizando Dispersão entre Valores não NAs
# The six variables at the top of missingResultTable. head() is used instead
# of [1:6] so that a shorter table does not produce NA column names.
missingDataColNames <- head(missingResultTable$Var, 6)

# Build the base ggplot of the observed (non-NA) values of the given columns,
# standardized per variable (mean 0, sd 1) so they share one axis.
#   Data: data set to take the columns from.
#   Vars: character vector of column names to plot.
# Returns a ggplot object without geoms.
buildStandardizedPlot <- function(Data, Vars) {
    Data %>%
        dplyr::select( dplyr::one_of(Vars) ) %>%
        tidyr::gather(Var, Val) %>%
        dplyr::filter(!is.na(Val)) %>%
        dplyr::group_by(Var) %>%
        dplyr::mutate(Val = (Val - mean(Val))/sd(Val) ) %>%
        dplyr::ungroup() %>%
        ggplot2::ggplot(ggplot2::aes(x = Var, fill = Var, color = Var, y = Val))
}

# Training set: box plot and raw points.
p <- buildStandardizedPlot(XTrain, missingDataColNames)
plotly::ggplotly(p + ggplot2::geom_boxplot())
plotly::ggplotly(p + ggplot2::geom_point())
# Test set: same two plots.
p <- buildStandardizedPlot(XTest, missingDataColNames)
plotly::ggplotly(p + ggplot2::geom_boxplot())
plotly::ggplotly(p + ggplot2::geom_point())
preProcessNAs
# Variables missing in more than 20% of the training rows.
NAToZeroVars <- missingResultTable %>%
    dplyr::filter(NaCountTrainPerc > 20.0) %>%
    dplyr::select(Var) %>%
    unlist() %>% as.character()

# Every other variable of the table. `<=` makes the two lists exhaustive (a
# variable at exactly 20% used to fall in neither), and the is.na() test keeps
# variables whose NAs occur only in the test set: after the full join their
# training percentage is NA, which filter() would otherwise drop.
NNImputationVars <- missingResultTable %>%
    dplyr::filter(is.na(NaCountTrainPerc) | NaCountTrainPerc <= 20.0) %>%
    dplyr::select(Var) %>%
    unlist() %>% as.character()


# Fill the missing values of a data set using two strategies.
#
# Args:
#   Data: data frame containing every column named in the two vectors below.
#   NAToZeroVars: character vector; NAs in these columns are replaced by 0.
#   NNImputationVars: character vector; these columns are imputed jointly
#     with DMwR::knnImputation(k = 10).
#
# Returns:
#   A plain data.frame holding the untouched columns first, then the
#   zero-filled columns, then the KNN-imputed columns.
#
# Base subsetting replaces the deprecated dplyr::select_() / dplyr::funs().
preProcessNAs <- function(Data, NAToZeroVars, NNImputationVars){
    handledVars <- c(NAToZeroVars, NNImputationVars)
    auxData <- Data[base::setdiff(names(Data), handledVars)]

    NAToZeroData <- Data[NAToZeroVars]
    if (length(NAToZeroVars) > 0) {
        NAToZeroData[] <- lapply(
            NAToZeroData,
            function(column) ifelse(is.na(column), 0, column)
        )
    }

    NNImputationData <- data.frame(Data[NNImputationVars])
    # Only call the imputation when there is something to impute: it cannot
    # run on a data frame without columns, and complete data needs no change.
    hasMissing <- any(vapply(NNImputationData, anyNA, logical(1)))
    if (length(NNImputationVars) > 0 && hasMissing) {
        NNImputationData <- DMwR::knnImputation(NNImputationData, k = 10)
    }

    data.frame(cbind(auxData, NAToZeroData, NNImputationData))
}

# Apply the same NA handling, with the same two variable lists, to the
# training and the test predictors.
XTrain <- preProcessNAs(XTrain,
                        NAToZeroVars = NAToZeroVars,
                        NNImputationVars = NNImputationVars)
XTest <- preProcessNAs(XTest,
                       NAToZeroVars = NAToZeroVars,
                       NNImputationVars = NNImputationVars)

Tipos e Distribuições das Variáveis

Tipo Dos Dados

Plot Valores Únicos
# One point per observation and variable, after min-max rescaling each
# variable, so the distinct levels of every variable show up as separate dots.
# A variable with a single value has zero range and becomes NaN (0 / 0).
p <- XTrain %>%
        tidyr::gather(key = Var, value = Val) %>%
        dplyr::group_by(Var) %>%
        dplyr::mutate(Val = ( Val - min(Val) ) / diff(range(Val)) ) %>%
        dplyr::ungroup() %>%
        data.frame() %>%
        ggplot2::ggplot(mapping = ggplot2::aes(x = Var, y = Val)) +
        ggplot2::geom_point()
p
## Warning: Removed 1318 rows containing missing values (geom_point).

Tabela de Valores Únicos
# Number of distinct values taken by each variable, fewest first.
XTrain %>%
    tidyr::gather(key = Var, value = Val) %>%
    unique() %>%
    dplyr::count(Var) %>%
    dplyr::rename(UniqueValuesCount = n) %>%
    dplyr::arrange(UniqueValuesCount) %>%
    data.frame()
##    Var UniqueValuesCount
## 1  V37                 1
## 2  V23                 2
## 3  V34                 2
## 4  V16                 3
## 5  V22                 3
## 6   V4                 3
## 7  V65                 3
## 8  V66                 3
## 9  V82                 3
## 10 V18                 4
## 11 V31                 4
## 12 V33                 4
## 13 V49                 4
## 14 V55                 4
## 15 V60                 4
## 16 V67                 4
## 17 V69                 4
## 18  V7                 4
## 19  V2                 5
## 20 V28                 5
## 21 V40                 5
## 22 V43                 5
## 23 V45                 5
## 24 V57                 5
## 25 V58                 5
## 26 V70                 5
## 27 V75                 5
## 28  V8                 5
## 29 V26                 6
## 30 V29                 6
## 31 V52                 6
## 32 V76                 6
## 33 V77                 6
## 34 V24                 7
## 35 V35                 7
## 36  V1                 8
## 37 V44                 8
## 38 V51                 8
## 39 V79                 8
## 40 V21                 9
## 41 V46                 9
## 42 V73                 9
## 43 V10                10
## 44 V27                12
## 45 V61                12
## 46 V63                12
## 47 V74                12
## 48 V19                15
## 49 V48                15
## 50 V56                16
## 51 V80                19
## 52  V6                21
## 53 V36                22
## 54  V9                25
## 55 V41                27
## 56 V83                28
## 57 V30                31
## 58 V53                31
## 59 V64                34
## 60  V3                36
## 61 V12                42
## 62 V38                53
## 63 V59                61
## 64 V13                66
## 65 V25                70
## 66 V81               111
## 67 V78               114
## 68 V68               129
## 69 V32               165
## 70 V11               188
## 71 V17               260
## 72 V72               320
## 73 V42               336
## 74 V15               393
## 75  V5               428
## 76 V47               593
## 77 V39               684
## 78 V62               718
## 79 V54               740
## 80 V20               806
## 81 V14               978
## 82 V71               978
## 83 V50              1318
# V37 has a single distinct value in the training set (first row of the table
# above), so it is removed from both sets.
XTrain <- dplyr::select(XTrain, -V37)
XTest <- dplyr::select(XTest, -V37)
Distribuições dos Dados
# Horizontal box plots of every variable after min-max rescaling, to compare
# the shapes of the distributions on a common axis.
p <- XTrain %>%
        tidyr::gather(key = Var, value = Val) %>%
        dplyr::group_by(Var) %>%
        dplyr::mutate(Val = ( Val - min(Val) ) / diff(range(Val)) ) %>%
        dplyr::ungroup() %>%
        data.frame() %>%
        ggplot2::ggplot(mapping = ggplot2::aes(x = Var, y = Val)) +
        ggplot2::geom_boxplot() + 
        ggplot2::coord_flip()
p

Colocando os Dados em Escala
preProcessScaleAndCenter
# Min-max rescale every column of a data set.
#
# Args:
#   Data: data frame of numeric columns to rescale.
#   Reference: data frame supplying the minimum and maximum of each column;
#     it must contain every column of Data. Defaults to Data itself, which
#     maps each column onto [0, 1]. Passing another data set (for example the
#     training predictors when rescaling the test predictors) puts Data on
#     that data set's scale instead.
#
# Returns:
#   Data with every column replaced by (value - min) / (max - min). Columns
#   whose reference range is zero are set to 0 instead of NaN (0 / 0).
preProcessScaleAndCenter <- function(Data, Reference = Data){
    if (ncol(Data) == 0) {
        return(Data)
    }
    rescaleColumn <- function(values, referenceValues) {
        lower <- min(referenceValues)
        span <- max(referenceValues) - lower
        if (!is.na(span) && span == 0) {
            # Constant reference column: keep NAs, map everything else to 0.
            return(0 * (values - lower))
        }
        (values - lower) / span
    }
    Data[] <- Map(rescaleColumn, Data, Reference[names(Data)])
    Data
}

# Rescale the training and the test predictors.
# NOTE(review): each set is rescaled with its own minimum and maximum, so
# XTest is not put on XTrain's scale - confirm this is intended.
XTrain <- preProcessScaleAndCenter(XTrain)
XTest <- preProcessScaleAndCenter(XTest)
Diminuindo o Número de Variáveis via PCA
# Fit the PCA on the training predictors.
PCAModel <- prcomp(x = data.matrix(XTrain))

# Importance table of the fit, one row per component. Component is turned
# into a factor whose levels follow the row order, so the bar charts keep the
# components in that order.
auxPCAData <- tibble::rownames_to_column(
    data.frame(t(summary(PCAModel)$importance)),
    var = "Component"
)
auxPCAData <- dplyr::mutate(
    auxPCAData,
    Component = factor(x = Component, levels = as.character(Component))
)

# Standard deviation of each component.
p <- ggplot2::ggplot(auxPCAData,
                     ggplot2::aes(x = Component, y = Standard.deviation)) +
    ggplot2::geom_bar(stat = 'identity')
plotly::ggplotly(p)
# Proportion of the variance explained by each component.
p <- ggplot2::ggplot(auxPCAData,
                     ggplot2::aes(x = Component, y = Proportion.of.Variance)) +
    ggplot2::geom_bar(stat = 'identity')
plotly::ggplotly(p)
# Cumulative proportion of the variance explained.
p <- ggplot2::ggplot(auxPCAData,
                     ggplot2::aes(x = Component, y = Cumulative.Proportion)) +
    ggplot2::geom_bar(stat = 'identity')
plotly::ggplotly(p)
preProcessPCA
# Project a data set onto the leading components of a fitted PCA.
#
# Args:
#   Data: data set accepted by predict() for PCAModel.
#   PCAModel: fitted model whose predict() method returns the score matrix
#     (in this file, the result of prcomp()).
#   nComponents: number of leading components to keep; must be a single
#     number between 1 and the number of components of the model.
#
# Returns:
#   A data.frame with the first nComponents score columns.
preProcessPCA <- function(Data, PCAModel, nComponents){
    scores <- predict(object = PCAModel, newdata = Data)
    # Fail early with a clear condition instead of letting an out-of-range
    # 1:nComponents reach the column selection.
    stopifnot(
        is.numeric(nComponents),
        length(nComponents) == 1,
        nComponents >= 1,
        nComponents <= ncol(scores)
    )
    # drop = FALSE keeps a one-component result as a column, not a vector.
    data.frame(scores[, seq_len(nComponents), drop = FALSE])
}
# Replace both predictor sets by their first 45 principal-component scores,
# using the single PCA model stored in PCAModel.
XTrain <- preProcessPCA(XTrain, PCAModel = PCAModel, nComponents = 45)
XTest <- preProcessPCA(XTest, PCAModel = PCAModel, nComponents = 45)

Salvando Dados de Teste e Treinamento Processados

# Write the processed predictors and the training response to CSV files.
# The destination is passed positionally: the `path` argument name is
# deprecated in newer readr releases (renamed to `file`), while the second
# positional argument works with both old and new versions.
readr::write_csv(XTrain, 'data/processedTrainData.csv')
readr::write_csv(XTest, 'data/processedTestData.csv')
# NOTE(review): piping YTrain into data.frame() names the single column `.`
# in the written file - confirm that downstream readers expect this header.
readr::write_csv(YTrain %>% data.frame(), 'data/TrainLabels.csv')